###############################################################################
## Generating Individual Speaker-Crisis Text Measures
## Schub, "Informing the Leader: Bureaucracies and International Crises", APSR
###############################################################################

## OUTLINE
 ## I.   Load data and dictionaries
 ## II.  Format text
 ## III. Generate uncertainty measures
 ## IV.  Generate content measures
 	## Train model (note: validation steps are in "Schub_BureaucracyLevelText_APSR.R")
 	## Apply model  


##################################
#### Load Libraries
##################################
library(foreign)
library(tm)
library(caret)
library(fastNaiveBayes)


#########################################################
#### I. Load Data and Dictionaries
#########################################################

## Point R at the folder holding the replication files (fill in the path)
setwd(dir = "")
## Base data; its `text` column holds the speech that the later sections process
all <- read.csv(file = "SpeakerLevel_Raw.csv")
## Dictionaries workspace; later code expects it to supply `udict`, the
## uncertainty dictionary
load(file = "dictionaries.RData")

#########################################################
#### II. Format Text
#########################################################

## Pull the raw text out of the base data as a character vector ##
test <- all$text
words <- as.character(test)

## Insert a space after commas, periods and question marks, then turn all
## punctuation into spaces and blank out the most common transcript filler
text <- gsub(pattern = ",", replacement = ", ", x = words)
text <- gsub(pattern = ".", replacement = ". ", x = text, fixed = TRUE)
text <- gsub(pattern = "?", replacement = "? ", x = text, fixed = TRUE)
# Alternative pre-processing that addresses contractions (requires package
# "textclean"); it slightly alters the final corpus but produces essentially
# identical measures, with a correlation above 0.998:
# text<-replace_contraction(text)
text <- gsub(pattern = "([-]|[[:punct:]])", replacement = " ", x = text)
# "SPLIT" marks the break between distinct speech segments that an individual
# contributed during a single meeting
text <- gsub(pattern = "SPLIT", replacement = " ", x = text)
# Case-sensitive substring match: also blanks "said" inside longer words
text <- gsub(pattern = "said", replacement = " ", x = text)

## Base corpus `corp`: lower-cased, digits removed, whitespace collapsed
corp <- Corpus(VectorSource(text))
corp <- tm_map(corp, content_transformer(tolower))
corp <- tm_map(corp, removeNumbers)
corp <- tm_map(corp, stripWhitespace)

## Processed corpus `a`: `corp` minus tm's English stopword list, then stemmed,
## with whitespace collapsed again afterwards
english_stops <- stopwords("english")
a <- tm_map(corp, removeWords, english_stops)
a <- tm_map(a, stemDocument, language = "english")
a <- tm_map(a, stripWhitespace)

## DTM of the base corpus (before stopword removal and stemming), counting
## terms of any length; used for the raw word count
dtw <- DocumentTermMatrix(corp, control = list(wordLengths = c(1, Inf)))


##################################
#### III. Generate Uncertainty Measures
##################################

## Raw word count: row totals of the pre-stopword-removal DTM
all$rawtotwords <- rowSums(as.matrix(dtw))

## Functional word count: row totals of a DTM built from the processed corpus
## (stopwords removed, stemmed), counting terms of any length
adtm <- DocumentTermMatrix(a, control = list(wordLengths = c(1, Inf)))
all$functotwords <- rowSums(as.matrix(adtm))

## Uncertainty measure: number of dictionary terms per text, as a share of the
## functional word count and as a percentage.
## Note: a text with zero functional words yields NaN here (0 / 0).
uobj <- DocumentTermMatrix(a, control = list(dictionary = udict))
all$uwords <- rowSums(as.matrix(uobj))
all$uper <- all$uwords / all$functotwords
all$uper100 <- 100 * all$uper


##################################
#### IV. Generate Content Measures
##################################

##############################################
### Main measures used throughout manuscript

##############
## Read the hand-coded training set and take its text as a character vector
trdata <- read.csv(file = "TrainingSet_Raw.csv")
trtext <- as.character(trdata$text)

## Insert a space after commas, periods and question marks, then turn all
## punctuation into spaces
trtext <- gsub(pattern = ",", replacement = ", ", x = trtext)
trtext <- gsub(pattern = ".", replacement = ". ", x = trtext, fixed = TRUE)
trtext <- gsub(pattern = "?", replacement = "? ", x = trtext, fixed = TRUE)
# Alternative pre-processing that addresses contractions (requires package
# "textclean"); it slightly alters the final training set but produces
# essentially identical measures, with a correlation above 0.998:
# trtext<-replace_contraction(trtext)
trtext <- gsub(pattern = "([-]|[[:punct:]])", replacement = " ", x = trtext)

## Create and pre-process the training data corpus (lower-case, drop digits)
trcorp <- Corpus(VectorSource(trtext))
trcorp <- tm_map(trcorp, content_transformer(tolower))
trcorp <- tm_map(trcorp, removeNumbers)

## Document-term matrix for the training data, stemmed and with stopwords
## removed. `wordLengths = c(3, Inf)` states the minimum term length explicitly;
## the former `minWordLength` entry is not a tm control option and was ignored,
## leaving tm's default of c(3, Inf) in force, so the result is unchanged.
trdtm <- DocumentTermMatrix(
  trcorp,
  control = list(stemming = TRUE, stopwords = TRUE, wordLengths = c(3, Inf))
)

## Remove sparse terms
sparse <- removeSparseTerms(trdtm, 0.98)

## Dense integer count matrix. Setting the storage mode keeps the matrix shape
## and dimnames even for a single document, unlike apply(..., as.integer).
trdtm <- as.matrix(sparse)
storage.mode(trdtm) <- "integer"

## Remove common transcript filler words. `trdf` keeps the full vocabulary and
## is reused for the no-leader-names training set. Columns are dropped by name
## so that a term absent from the vocabulary is skipped instead of raising a
## cryptic error (in subset(), a missing `split` resolves to base::split).
trdf <- as.data.frame(trdtm)
filler_terms <- c("split", "said")
trdftemp <- trdf[, !(colnames(trdf) %in% filler_terms), drop = FALSE]
trdtm <- as.matrix(trdftemp)

## Label rows with the training file identifiers and return to a data frame
trdtmDocNames <- trdata$files
rownames(trdtm) <- trdtmDocNames
trdtm <- as.data.frame(trdtm)

## Hand-coded document classes for the training set: rows coded 1 in
## `political` become "political", every other non-missing code "military"
coding <- trdata$political
coding2 <- ifelse(coding == 1, "political", "military")

## Train multinomial naive Bayes (Laplace smoothing of 1) on the training
## term counts; `dv` is reused when the no-leader-names model is trained
iv <- trdtm
dv <- coding2
m <- fnb.multinomial(iv, dv, laplace = 1)


############################
## Prepare applied data for use  

## Document-term matrix for the applied texts, stemmed and with stopwords
## removed. `wordLengths = c(3, Inf)` states the minimum term length explicitly;
## the former `minWordLength` entry is not a tm control option and was ignored,
## leaving tm's default of c(3, Inf) in force, so the result is unchanged.
dtm <- DocumentTermMatrix(
  corp,
  control = list(stemming = TRUE, stopwords = TRUE, wordLengths = c(3, Inf))
)

## Dense integer count matrix. Setting the storage mode keeps the matrix shape
## and dimnames even for a single document, unlike apply(..., as.integer).
dtm <- as.matrix(dtm)
storage.mode(dtm) <- "integer"
df <- as.data.frame(dtm)
dtm <- as.matrix(df)

## Add rownames to the dtm (one per row of `all`, taken from its id column)
dtmDocNames <- as.character(all$id)
rownames(dtm) <- dtmDocNames
dtmapplied <- dtm

## Subset the applied set to terms in the training set; drop = FALSE keeps a
## matrix even if only a single term overlaps
trterms <- colnames(trdtm)
dtmsub <- dtmapplied[, colnames(dtmapplied) %in% trterms, drop = FALSE]
dim(dtmsub) # prints documents x retained terms as a quick check

## Applied text score: continuous
## Ratio of the model's raw "political" to "military" scores, flipped around 1.
## NOTE(review): labelling flip > 1 as political below is consistent with
## `rawprob` being on a log (negative) scale; confirm against fastNaiveBayes.
preda <- as.data.frame(predict(m, dtmsub, type = "rawprob"))
predaratio <- preda$political / preda$military
predaflip <- (1 - predaratio) + 1
all$fnbflip <- predaflip

## Min-max standardized score, computed from and reported for only those texts
## with at least 30 functional words (NA otherwise)
long_enough <- all$functotwords >= 30
fnbflip30 <- all$fnbflip[long_enough]
flip_range <- range(fnbflip30)
all$fnbstd <- ifelse(
  long_enough,
  (all$fnbflip - flip_range[1]) / (flip_range[2] - flip_range[1]),
  NA
)

## Applied text score: binary
## Scores above 1 are labelled political, below 1 military; missing scores and
## scores of exactly 1 stay NA
is_political <- !is.na(predaflip) & predaflip > 1
is_military <- !is.na(predaflip) & predaflip < 1
predabi <- NA
predabi[is_political] <- "political"
predabi[is_military] <- "military"
all$fnbclass <- predabi

## 0/1 indicator version of the class label (NA where no class was assigned)
all$fnbpolitical <- ifelse(all$fnbclass == "political", 1, 0)
all$fnbpolitical[is.na(all$fnbclass)] <- NA

## Generate codings for "expert" type: an ideal-type bureaucracy discussing its
## core content. Coded 1 when a state speaker's text is classified political or
## a defense/jcs speaker's text is classified military, 0 otherwise, and NA for
## cia and executive speakers. Rows with a missing classification keep the 0
## unless the role rule sets them to NA.
state_on_political <- all$state == 1 & all$fnbpolitical == 1
military_on_military <- all$role %in% c("defense", "jcs") & all$fnbpolitical == 0
not_ideal_type <- all$role %in% c("cia", "executive")

all$competent2 <- 0
all$competent2[state_on_political] <- 1
all$competent2[military_on_military] <- 1
all$competent2[not_ideal_type] <- NA


##############################################
### Alternative measures excluding leader names

## Create alternative training data without names: drop the transcript filler
## terms plus the leader names that appear in the training set. Columns are
## excluded by name with `[` rather than subset()'s non-standard evaluation, so
## a term missing from the vocabulary is skipped instead of raising a cryptic
## error (in subset(), a missing `split` resolves to base::split).
drop_terms_nn <- c("split", "said", "nasser", "castro", "chiang", "khrushchev")
trdftemp_nn <- trdf[, !(colnames(trdf) %in% drop_terms_nn), drop = FALSE]
trdtm_nn <- as.matrix(trdftemp_nn)

## Add rownames to dtm (training file identifiers)
trdtmDocNames <- trdata$files
rownames(trdtm_nn) <- trdtmDocNames
trdtm_nn <- as.data.frame(trdtm_nn)
trdtm_nonames <- trdtm_nn

## Train naive bayes on the no-names term counts, using the same hand-coded
## classes (`dv`) as the main model
ivn <- trdtm_nonames
mn <- fnb.multinomial(ivn, dv, laplace = 1)


################
## Apply Model using training data that excludes leader names

## Subset the applied set to terms in the no-names training set; drop = FALSE
## keeps a matrix even if only a single term overlaps
trtermsn <- colnames(trdtm_nonames)
dtmsubn <- dtmapplied[, colnames(dtmapplied) %in% trtermsn, drop = FALSE]


## Applied text score: continuous (excludes leader names)
## Ratio of the no-names model's raw "political" to "military" scores, flipped
## around 1.
## NOTE(review): labelling flip > 1 as political below is consistent with
## `rawprob` being on a log (negative) scale; confirm against fastNaiveBayes.
predan <- as.data.frame(predict(mn, dtmsubn, type = "rawprob"))
predaration <- predan$political / predan$military
predaflipn <- (1 - predaration) + 1
all$fnbflip_nonames <- predaflipn

## Min-max standardized score, computed from and reported for only those texts
## with at least 30 functional words (NA otherwise)
long_enoughn <- all$functotwords >= 30
fnbflip30n <- all$fnbflip_nonames[long_enoughn]
flip_rangen <- range(fnbflip30n)
all$fnbstd_nonames <- ifelse(
  long_enoughn,
  (all$fnbflip_nonames - flip_rangen[1]) / (flip_rangen[2] - flip_rangen[1]),
  NA
)

## Applied text score: binary (excludes leader names)
## Scores above 1 are labelled political, below 1 military; missing scores and
## scores of exactly 1 stay NA
is_politicaln <- !is.na(predaflipn) & predaflipn > 1
is_militaryn <- !is.na(predaflipn) & predaflipn < 1
predabin <- NA
predabin[is_politicaln] <- "political"
predabin[is_militaryn] <- "military"
all$fnbclass_nonames <- predabin

## 0/1 indicator version of the class label (NA where no class was assigned)
all$fnbpolitical_nonames <- ifelse(all$fnbclass_nonames == "political", 1, 0)
all$fnbpolitical_nonames[is.na(all$fnbclass_nonames)] <- NA

####################
## Outputs the data that is used for individual level analysis in "Schub_MainResults_APSR.R"
# write.csv(all,"SpeakerLevel_Polished.csv")
####################



